import os
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import warnings
warnings.filterwarnings("ignore")
Dossier contenant toutes les données
# Root folder that holds every CSV file read by this notebook.
DirPath = "./data/"

# Three tables are loaded: one row per track, per genre, and per year.
data = pd.read_csv(f"{DirPath}data.csv")
genre_data = pd.read_csv(f"{DirPath}data_by_genres.csv")
year_data = pd.read_csv(f"{DirPath}data_by_year.csv")

# Bare expression: shows the per-track table as the cell output.
data
| valence | year | acousticness | artists | danceability | duration_ms | energy | explicit | id | instrumentalness | key | liveness | loudness | mode | name | popularity | release_date | speechiness | tempo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0594 | 1921 | 0.98200 | ['Sergei Rachmaninoff', 'James Levine', 'Berli... | 0.279 | 831667 | 0.211 | 0 | 4BJqT0PrAfrxzMOxytFOIz | 0.878000 | 10 | 0.6650 | -20.096 | 1 | Piano Concerto No. 3 in D Minor, Op. 30: III. ... | 4 | 1921 | 0.0366 | 80.954 |
| 1 | 0.9630 | 1921 | 0.73200 | ['Dennis Day'] | 0.819 | 180533 | 0.341 | 0 | 7xPhfUan2yNtyFG0cUWkt8 | 0.000000 | 7 | 0.1600 | -12.441 | 1 | Clancy Lowered the Boom | 5 | 1921 | 0.4150 | 60.936 |
| 2 | 0.0394 | 1921 | 0.96100 | ['KHP Kridhamardawa Karaton Ngayogyakarta Hadi... | 0.328 | 500062 | 0.166 | 0 | 1o6I8BglA6ylDMrIELygv1 | 0.913000 | 3 | 0.1010 | -14.850 | 1 | Gati Bali | 5 | 1921 | 0.0339 | 110.339 |
| 3 | 0.1650 | 1921 | 0.96700 | ['Frank Parker'] | 0.275 | 210000 | 0.309 | 0 | 3ftBPsC5vPBKxYSee08FDH | 0.000028 | 5 | 0.3810 | -9.316 | 1 | Danny Boy | 3 | 1921 | 0.0354 | 100.109 |
| 4 | 0.2530 | 1921 | 0.95700 | ['Phil Regan'] | 0.418 | 166693 | 0.193 | 0 | 4d6HGyGT8e121BsdKmw9v6 | 0.000002 | 3 | 0.2290 | -10.096 | 1 | When Irish Eyes Are Smiling | 2 | 1921 | 0.0380 | 101.665 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 170648 | 0.6080 | 2020 | 0.08460 | ['Anuel AA', 'Daddy Yankee', 'KAROL G', 'Ozuna... | 0.786 | 301714 | 0.808 | 0 | 0KkIkfsLEJbrcIhYsCL7L5 | 0.000289 | 7 | 0.0822 | -3.702 | 1 | China | 72 | 2020-05-29 | 0.0881 | 105.029 |
| 170649 | 0.7340 | 2020 | 0.20600 | ['Ashnikko'] | 0.717 | 150654 | 0.753 | 0 | 0OStKKAuXlxA0fMH54Qs6E | 0.000000 | 7 | 0.1010 | -6.020 | 1 | Halloweenie III: Seven Days | 68 | 2020-10-23 | 0.0605 | 137.936 |
| 170650 | 0.6370 | 2020 | 0.10100 | ['MAMAMOO'] | 0.634 | 211280 | 0.858 | 0 | 4BZXVFYCb76Q0Klojq4piV | 0.000009 | 4 | 0.2580 | -2.226 | 0 | AYA | 76 | 2020-11-03 | 0.0809 | 91.688 |
| 170651 | 0.1950 | 2020 | 0.00998 | ['Eminem'] | 0.671 | 337147 | 0.623 | 1 | 5SiZJoLXp3WOl3J4C8IK0d | 0.000008 | 2 | 0.6430 | -7.161 | 1 | Darkness | 70 | 2020-01-17 | 0.3080 | 75.055 |
| 170652 | 0.6420 | 2020 | 0.13200 | ['KEVVO', 'J Balvin'] | 0.856 | 189507 | 0.721 | 1 | 7HmnJHfs0BkFzX4x8j0hkl | 0.004710 | 7 | 0.1820 | -4.928 | 1 | Billetes Azules (with J Balvin) | 74 | 2020-10-16 | 0.1080 | 94.991 |
170653 rows × 19 columns
On dispose de plusieurs paramètres, tels que le taux d'acoustique, le taux d'instrumental, le tempo et la dansabilité, pour chaque morceau.
# Bare expression: shows the per-genre table (read from data_by_genres.csv) as the cell output.
genre_data
| mode | genres | acousticness | danceability | duration_ms | energy | instrumentalness | liveness | loudness | speechiness | tempo | valence | popularity | key | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 21st century classical | 0.979333 | 0.162883 | 1.602977e+05 | 0.071317 | 0.606834 | 0.361600 | -31.514333 | 0.040567 | 75.336500 | 0.103783 | 27.833333 | 6 |
| 1 | 1 | 432hz | 0.494780 | 0.299333 | 1.048887e+06 | 0.450678 | 0.477762 | 0.131000 | -16.854000 | 0.076817 | 120.285667 | 0.221750 | 52.500000 | 5 |
| 2 | 1 | 8-bit | 0.762000 | 0.712000 | 1.151770e+05 | 0.818000 | 0.876000 | 0.126000 | -9.180000 | 0.047000 | 133.444000 | 0.975000 | 48.000000 | 7 |
| 3 | 1 | [] | 0.651417 | 0.529093 | 2.328809e+05 | 0.419146 | 0.205309 | 0.218696 | -12.288965 | 0.107872 | 112.857352 | 0.513604 | 20.859882 | 7 |
| 4 | 1 | a cappella | 0.676557 | 0.538961 | 1.906285e+05 | 0.316434 | 0.003003 | 0.172254 | -12.479387 | 0.082851 | 112.110362 | 0.448249 | 45.820071 | 7 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2968 | 1 | zolo | 0.222625 | 0.547082 | 2.580991e+05 | 0.610240 | 0.143872 | 0.204206 | -11.295878 | 0.061088 | 125.494919 | 0.596155 | 33.778943 | 9 |
| 2969 | 0 | zouglou | 0.161000 | 0.863000 | 2.063200e+05 | 0.909000 | 0.000000 | 0.108000 | -5.985000 | 0.081300 | 119.038000 | 0.845000 | 58.000000 | 7 |
| 2970 | 1 | zouk | 0.263261 | 0.748889 | 3.060728e+05 | 0.622444 | 0.257227 | 0.089678 | -10.289222 | 0.038778 | 101.965222 | 0.824111 | 46.666667 | 5 |
| 2971 | 0 | zurich indie | 0.993000 | 0.705667 | 1.984173e+05 | 0.172667 | 0.468633 | 0.179667 | -11.453333 | 0.348667 | 91.278000 | 0.739000 | 0.000000 | 7 |
| 2972 | 1 | zydeco | 0.421038 | 0.629409 | 1.716717e+05 | 0.609369 | 0.019248 | 0.255877 | -9.854825 | 0.050491 | 126.366087 | 0.808544 | 30.261905 | 7 |
2973 rows × 14 columns
On dispose aussi de plusieurs paramètres, tels que le taux d'acoustique, le taux d'instrumental, le tempo et la dansabilité, pour chaque genre musical.
# Bare expression: shows the per-year table (read from data_by_year.csv) as the cell output.
year_data
| mode | year | acousticness | danceability | duration_ms | energy | instrumentalness | liveness | loudness | speechiness | tempo | valence | popularity | key | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1921 | 0.886896 | 0.418597 | 260537.166667 | 0.231815 | 0.344878 | 0.205710 | -17.048667 | 0.073662 | 101.531493 | 0.379327 | 0.653333 | 2 |
| 1 | 1 | 1922 | 0.938592 | 0.482042 | 165469.746479 | 0.237815 | 0.434195 | 0.240720 | -19.275282 | 0.116655 | 100.884521 | 0.535549 | 0.140845 | 10 |
| 2 | 1 | 1923 | 0.957247 | 0.577341 | 177942.362162 | 0.262406 | 0.371733 | 0.227462 | -14.129211 | 0.093949 | 114.010730 | 0.625492 | 5.389189 | 0 |
| 3 | 1 | 1924 | 0.940200 | 0.549894 | 191046.707627 | 0.344347 | 0.581701 | 0.235219 | -14.231343 | 0.092089 | 120.689572 | 0.663725 | 0.661017 | 10 |
| 4 | 1 | 1925 | 0.962607 | 0.573863 | 184986.924460 | 0.278594 | 0.418297 | 0.237668 | -14.146414 | 0.111918 | 115.521921 | 0.621929 | 2.604317 | 5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 95 | 1 | 2016 | 0.284171 | 0.600202 | 221396.510295 | 0.592855 | 0.093984 | 0.181170 | -8.061056 | 0.104313 | 118.652630 | 0.431532 | 59.647190 | 0 |
| 96 | 1 | 2017 | 0.286099 | 0.612217 | 211115.696787 | 0.590421 | 0.097091 | 0.191713 | -8.312630 | 0.110536 | 117.202740 | 0.416476 | 63.263554 | 1 |
| 97 | 1 | 2018 | 0.267633 | 0.663500 | 206001.007133 | 0.602435 | 0.054217 | 0.176326 | -7.168785 | 0.127176 | 121.922308 | 0.447921 | 63.296243 | 1 |
| 98 | 1 | 2019 | 0.278299 | 0.644814 | 201024.788096 | 0.593224 | 0.077640 | 0.172616 | -7.722192 | 0.121043 | 120.235644 | 0.458818 | 65.256542 | 1 |
| 99 | 1 | 2020 | 0.219931 | 0.692904 | 193728.397537 | 0.631232 | 0.016376 | 0.178535 | -6.595067 | 0.141384 | 124.283129 | 0.501048 | 64.301970 | 1 |
100 rows × 14 columns
Enfin, on dispose de plusieurs paramètres, tels que le taux d'acoustique, le taux d'instrumental, le tempo et la dansabilité, pour chaque année.
C'est pourquoi on va s'intéresser à ces différentes variables.
En utilisant les données regroupées par année, nous pouvons comprendre comment la musique a changé de 1921 à 2020.
def get_date(year):
    """Return the decade label that contains ``year``.

    Args:
        year: A calendar year. Anything accepted by ``int()`` works
            (Python int, NumPy integer, float holding a whole year, ...).

    Returns:
        The first year of the decade followed by ``"s"``, e.g. ``"1990s"``
        for 1990 through 1999.

    Raises:
        ValueError / TypeError: propagated from ``int(year)`` when ``year``
            cannot be converted (for example NaN or ``None``).
    """
    # Integer floor division avoids the float round-trip of ``year / 10``
    # and keeps every decade exactly ten years wide, including around zero.
    start = (int(year) // 10) * 10
    return f"{start}s"
# Tag every track with its decade label; the column keeps the name "decenie".
data["decenie"] = data["year"].apply(get_date)

# Enlarge the default figure size for the seaborn plots that follow.
sns.set(rc={"figure.figsize": (11, 6)})

# Number of tracks per decade. The column is passed by keyword: recent
# seaborn releases bind the first positional argument to `data`, not `x`,
# so a positional Series is no longer read as the variable to count.
sns.countplot(x="decenie", data=data)
<AxesSubplot:xlabel='decenie', ylabel='count'>
# Audio descriptors whose yearly averages are drawn, one line per feature.
sound_features = [
    "acousticness",
    "danceability",
    "energy",
    "instrumentalness",
    "liveness",
    "valence",
]

# Line chart of each descriptor against the year, from the per-year table.
figure = px.line(
    year_data,
    x="year",
    y=sound_features,
)
figure.show()
À partir de 1980, les différentes variables commencent à se stabiliser au fil du temps.
Voici le top 10 des genres les plus écoutés.
# Keep the ten rows of the per-genre table with the highest "popularity".
top_10_genres = genre_data.nlargest(n=10, columns="popularity")

# Grouped bars: four audio descriptors side by side for each selected genre.
figure = px.bar(
    top_10_genres,
    x="genres",
    y=["valence", "energy", "danceability", "acousticness"],
    barmode="group",
)
figure.show()
# Standardise the numeric features, then split the tracks into 20 K-Means
# clusters.
musique_cluster = Pipeline(
    [
        ("scaler", StandardScaler()),
        # A fixed seed keeps the cluster ids reproducible from run to run.
        ("kmeans", KMeans(n_clusters=20, verbose=False, random_state=42)),
    ],
    verbose=False,
)

# Feature matrix: every numeric column of `data`. A "cluster_label" column
# written by a previous run of this cell is numeric too, so it is dropped
# first; otherwise the old labels would be fed back in as a feature.
# errors="ignore" makes the drop a no-op on the first run.
X = data.drop(columns=["cluster_label"], errors="ignore").select_dtypes(np.number)
cl = list(X.columns)

# Fit on all tracks, then store each track's cluster id next to it.
musique_cluster.fit(X)
musique_cluster_labels = musique_cluster.predict(X)
data["cluster_label"] = musique_cluster_labels
from sklearn.decomposition import PCA
# Standardise the numeric features and keep their first two principal
# components, giving a 2-D view of the clustered tracks.
pca_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("PCA", PCA(n_components=2)),
    ]
)
s = pca_pipeline.fit_transform(X)

# 2-D coordinates plus the track title and cluster id used by the plot.
projection = pd.DataFrame(s, columns=["x", "y"]).assign(
    title=data["name"],
    cluster=data["cluster_label"],
)

# Scatter of the projection, coloured by cluster, with titles on hover.
figure = px.scatter(
    projection,
    x="x",
    y="y",
    color="cluster",
    hover_data=["x", "y", "title"],
)
figure.show()